# Code to replicate tables and figures presented in HDSR draft

# Tables to replicate
# Table 1: Descriptive statistics of publication data for US based economics researchers. Panel A publication by all US-based researchers, Panel B publication by users of Census-curated confidential data descriptives_panels
# Table 2: Economics research using restricted-access microdata has higher scientific impact summary_results_02_01_a_REVISION
# Table 3: Economics research using restricted-access microdata has higher policy impact summary_policy_results_02_01_a_REVISION

# Appendix tables
# Table A1. Articles using restricted-access microdata are more likely to include a mix of senior and junior authors. table_age
# Table A2. Articles using restricted-access microdata are more likely to include authors from prestigious institutions. table_ranking
# Table A3. Articles using restricted-access microdata are more likely to include authors with previous Top 5 publications. team_quality_01_a


# Figures to replicate
# Figure 1.a. Articles using restricted-access data, 1.b. Share of articles using restricted-access data
# Figure 2. Share of articles using restricted-access microdata by research field. share_articles_field_fsrdc_04_REV
# Figure 3. Maximum age difference between co-authors of articles using restricted-access microdata. age_diff_between_coauthors_within_paper_01_v03_REV

# Appendix figures
# Figure C1. Yearly publications employing restricted-access microdata. RDC_trend
# Figure C2. Likelihood that paper employing restricted-access microdata appear in a top five journal . RDC_likelihood_top5
# Figure C3. Share of articles using restricted-access microdata by research field 1991-2000. share_articles_field_fsrdc_04_1991-2000
# Figure C4. Share of articles using restricted-access microdata by research field 2010-2019. share_articles_field_fsrdc_04_2010-2019

# Notes
# Variable NLS_use refers to whether the paper uses data from any of all NLS cohorts

# Author: Fernando Stipanicic, March 2025. 

# setwd("C:/Users/.../") # set working directory. this directory should contain folders code/, data/, output/


library(data.table)
library(ggplot2)
library(RColorBrewer)
library(dplyr)
library(xtable)
library(fixest)

source("./code/plot_theme.R") # standarized plot characteristics

pd <- position_dodge(0.3) # to shift to the side the points such that bars do not overlap in plot

getPalette = colorRampPalette(brewer.pal(9, "Set3")) # get colors from palette Set3. this function allows to extend the original palette of colors to have more colors
c25 <- c(
  "dodgerblue2", "#E31A1C", # red
  "green4",
  "#6A3D9A", # purple
  "#FF7F00", # orange
  "black", "gold1",
  "skyblue2", "#FB9A99", # lt pink
  "palegreen2",
  "#CAB2D6", # lt purple
  "#FDBF6F", # lt orange
  "gray70", "khaki2",
  "maroon", "orchid1", "deeppink1", "blue1", "steelblue4",
  "darkturquoise", "green1", "yellow4", "yellow3",
  "darkorange4", "brown"
)

# # load data
data_01 <- fread("data/data.csv")

### REPLICATION OF TABLES
# Table 1: Descriptive statistics of publication data for US based economics researchers. Panel A publication by all US-based researchers, Panel B publication by users of Census-curated confidential data descriptives_panels
# table 1 panel A
table_descriptive_01_a_r1 <- data_01[FSRDC_use==1,.(data_use="FSRDC", n_articles=uniqueN(article_id), n_authors=uniqueN(author_id), n_institutions=uniqueN(institution_id))]
table_descriptive_01_a_r1 <- cbind(table_descriptive_01_a_r1, data_01[!duplicated(data_01, by='article_id')][FSRDC_use==1][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r1 <- cbind(table_descriptive_01_a_r1, data_01[!duplicated(data_01, by='article_id')][FSRDC_use==1][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r1 <- cbind(table_descriptive_01_a_r1, data_01[!duplicated(data_01, by='article_id')][FSRDC_use==1][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_a_r2 <- data_01[NLS_use==1|PSID_use==1,.(data_use="NLS/PSID", n_articles=uniqueN(article_id), n_authors=uniqueN(author_id), n_institutions=uniqueN(institution_id))]
table_descriptive_01_a_r2 <- cbind(table_descriptive_01_a_r2, data_01[!duplicated(data_01, by='article_id')][(NLS_use==1|PSID_use==1)][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r2 <- cbind(table_descriptive_01_a_r2, data_01[!duplicated(data_01, by='article_id')][NLS_use==1|PSID_use==1][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r2 <- cbind(table_descriptive_01_a_r2, data_01[!duplicated(data_01, by='article_id')][NLS_use==1|PSID_use==1][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_a_r3 <- data_01[!(FSRDC_use==1|NLS_use==1|PSID_use==1),.(data_use="Other/None", n_articles=uniqueN(article_id), n_authors=uniqueN(author_id), n_institutions=uniqueN(institution_id))]
table_descriptive_01_a_r3 <- cbind(table_descriptive_01_a_r3, data_01[!duplicated(data_01, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r3 <- cbind(table_descriptive_01_a_r3, data_01[!duplicated(data_01, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r3 <- cbind(table_descriptive_01_a_r3, data_01[!duplicated(data_01, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_a_r4 <- data_01[,.(data_use="Total", n_articles=uniqueN(article_id), n_authors=uniqueN(author_id), n_institutions=uniqueN(institution_id))]
table_descriptive_01_a_r4 <- cbind(table_descriptive_01_a_r4, data_01[!duplicated(data_01, by='article_id')][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r4 <- cbind(table_descriptive_01_a_r4, data_01[!duplicated(data_01, by='article_id')][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_a_r4 <- cbind(table_descriptive_01_a_r4, data_01[!duplicated(data_01, by='article_id')][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_a <- rbind(table_descriptive_01_a_r1, table_descriptive_01_a_r2, table_descriptive_01_a_r3, table_descriptive_01_a_r4)
table_descriptive_01_a[,n_articles_top5:=(n_articles_top5/n_articles)*100]
table_descriptive_01_a <- table_descriptive_01_a %>% rename(share_articles_top5 = n_articles_top5)

names(table_descriptive_01_a) <- c('\\makecell[c]{Type of\\\\microdata used}', '\\makecell[c]{N.\\\\Articles}', '\\makecell[c]{N.\\\\Authors}', "\\makecell[c]{N.\\\\Institutions}", "\\makecell[c]{Avg. yearly\\\\citations}", "\\makecell[c]{\\% Articles in\\\\Top 5 journal}", "\\makecell[c]{Avg. yearly cit.\\\\Top 5 journal}", "\\makecell[c]{Avg. yearly\\\\policy cit.}")
print(xtable(table_descriptive_01_a, label='table_descriptive_01_a', caption='Descriptive statistics of publication data for U.S.-based economics researchers. Panel A: Publications by all U.S.-based researchers'),latex.environments=c('center','widestuff'), size='\\fontsize{10pt}{12pt}\\selectfont', floating = T ,booktabs=TRUE, include.rownames = F, format.args=list(big.mark=","), sanitize.text.function = function(x) x, hline.after=c(-1,0,nrow(table_descriptive_01_a)-1,nrow(table_descriptive_01_a)), caption.placement = "top", file = 'output/tables/table_01_a.tex') 

# table 1 panel B
data_02 <- data_01[fsrdc_author==1]
table_descriptive_01_b_r1 <- data_02[FSRDC_use==1,.(data_use="FSRDC", n_articles=uniqueN(article_id))]
table_descriptive_01_b_r1 <- cbind(table_descriptive_01_b_r1, data_02[!duplicated(data_02, by='article_id')][FSRDC_use==1][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r1 <- cbind(table_descriptive_01_b_r1, data_02[!duplicated(data_02, by='article_id')][FSRDC_use==1][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r1 <- cbind(table_descriptive_01_b_r1, data_02[!duplicated(data_02, by='article_id')][FSRDC_use==1][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_b_r2 <- data_02[NLS_use==1|PSID_use==1,.(data_use="NLS/PSID", n_articles=uniqueN(article_id))]
table_descriptive_01_b_r2 <- cbind(table_descriptive_01_b_r2, data_02[!duplicated(data_02, by='article_id')][(NLS_use==1|PSID_use==1)][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r2 <- cbind(table_descriptive_01_b_r2, data_02[!duplicated(data_02, by='article_id')][NLS_use==1|PSID_use==1][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r2 <- cbind(table_descriptive_01_b_r2, data_02[!duplicated(data_02, by='article_id')][NLS_use==1|PSID_use==1][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_b_r3 <- data_02[!(FSRDC_use==1|NLS_use==1|PSID_use==1),.(data_use="Other/None", n_articles=uniqueN(article_id))]
table_descriptive_01_b_r3 <- cbind(table_descriptive_01_b_r3, data_02[!duplicated(data_02, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][,.(avg_yearly_citations=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r3 <- cbind(table_descriptive_01_b_r3, data_02[!duplicated(data_02, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][top5==1,.(n_articles_top5=uniqueN(article_id), avg_yearly_citations_top5=round(mean(avg_yearly_citations),2))])
table_descriptive_01_b_r3 <- cbind(table_descriptive_01_b_r3, data_02[!duplicated(data_02, by='article_id')][!(FSRDC_use==1|NLS_use==1|PSID_use==1)][,.(avg_year_n_policy_citations=round(mean(avg_year_policy_citations),2))])

table_descriptive_01_b <- rbind(table_descriptive_01_b_r1, table_descriptive_01_b_r2, table_descriptive_01_b_r3)
table_descriptive_01_b[,n_articles_top5:=(n_articles_top5/n_articles)*100]
table_descriptive_01_b <- table_descriptive_01_b %>% rename(share_articles_top5 = n_articles_top5)

names(table_descriptive_01_b) <- c('\\makecell[c]{Type of\\\\microdata used}', '\\makecell[c]{N.\\\\Articles}', "\\makecell[c]{Avg. yearly\\\\citations}", "\\makecell[c]{\\% Articles in\\\\Top 5 journal}", "\\makecell[c]{Avg. yearly cit.\\\\Top 5 journal}", "\\makecell[c]{Avg. yearly\\\\policy cit.}")
print(xtable(table_descriptive_01_b, label='table_descriptive_01_b', caption='Descriptive statistics of publication data for U.S.-based economics researchers. Panel B: Publications by users of Census-curated confidential data'),latex.environments=c('center','widestuff'), size='\\fontsize{10pt}{12pt}\\selectfont', floating = T ,booktabs=TRUE, include.rownames = F, format.args=list(big.mark=","), sanitize.text.function = function(x) x, hline.after=c(-1,0,nrow(table_descriptive_01_b)), caption.placement = "top", file = 'output/tables/table_01_b.tex')

# Table 2: Economics research using restricted-access microdata has higher scientific impact
data_reg_01 <- data_01[FSRDC_use==1|NLS_use==1|PSID_use==1]

table_02_column1 <- feols(top5 ~ FSRDC_use | author_id + year^field, data=data_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_02_column2to3 <- feols(asinh(n_citations) ~ FSRDC_use | sw(author_id + year^field, author_id + year^field + journal) , data=data_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_02_column4 <- feols(top5 ~ FSRDC_use | author_id + year^field, data=data_reg_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_02_column5to6 <- feols(asinh(n_citations) ~ FSRDC_use | sw(author_id + year^field, author_id + year^field + journal) , data=data_reg_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)

mean_lhs <- c(round(mean(data_01[n_papers_per_author>1][table_02_column1$obs_selection[[1]], top5]), 3), 
              rep(paste(round(mean(data_01[n_papers_per_author>1][table_02_column2to3$`fixef: author_id + year^field + journal`$obs_selection[[1]], n_citations]),0), '\ncitations', sep=""), 2),
              round(mean(data_reg_01[n_papers_per_author>1][table_02_column4$obs_selection[[1]], top5]), 3), 
              rep(paste(round(mean(data_reg_01[n_papers_per_author>1][table_02_column5to6$`fixef: author_id + year^field + journal`$obs_selection[[1]], n_citations]),0), '\ncitations', sep=""), 2))

notes_reg_01_b = "Results of estimating by OLS: $y_{iajft} = FSRDC_{i} + FE_{...} + \\epsilon_{iajft}$ for paper $i$, author $a$, journal $j$, field $f$ and year of publication $t$. In columns (1) and (4) the outcome variable $y_{iajft}$ is whether the paper was published in a top 5 journal (i.e. American Economic Review, Econometrica, Journal of Political Economy, Review of Economic Studies, The Quarterly Journal of Economics). In columns (2) to (3) and (5) to (6) the outcome variable $y_{iajft}$ is the inverse hyperbolic sine of the amount of citations that the paper $i$ has received. Each paper is classified into only one 16 economics fields. In order to identify the author fixed effect it is required that the author publishes more than one paper. We keep the sample of papers constant across regressions at the paper-author level by dropping single-paper authors. The amount of observations reported is the effective sample size."

etable(table_02_column1 ,table_02_column2to3$`fixef: author_id + year^field`, table_02_column2to3$`fixef: author_id + year^field + journal`,
table_02_column4 ,table_02_column5to6$`fixef: author_id + year^field`, table_02_column5to6$`fixef: author_id + year^field + journal`,
        extralines=list('_^Observation level'=c('paper-author', 'paper-author', 'paper-author', 'paper-author', 'paper-author', 'paper-author'), "__Mean LHS"=mean_lhs), 
       tex=T, digits=3, meta=~time+call, se.below = T, digits.stats ='r2', fitstat=c('n'), se.row = T,  dict=c(year='Year', field="Field", FSRDC_use='Uses restricted-access data (0/1)', "asinh(n_citations)"='asinh(citations received)', top5='Top 5 publication', author_id="Author", journal='Journal'), notes=paste("The sample in columns (1) to (3) includes all papers. In columns (4) to (6) includes only papers that used FSRDC, NLS or PSID data. ", notes_reg_01_b, sep=""), tpt=T, file='output/tables/table_02.tex', replace=T)

# Table 3: Economics research using restricted-access microdata has higher policy impact
data_reg_02 <- data_01[FSRDC_use==1|NLS_use==1|PSID_use==1]

table_03_column1 <- feols(as.numeric(n_policy_citations>0) ~ FSRDC_use | author_id + year^field, data=data_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_03_column2to3 <- feols(asinh(n_policy_citations) ~ FSRDC_use | sw(author_id + year^field, author_id + year^field + journal) , data=data_01[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_03_column4 <- feols(as.numeric(n_policy_citations>0) ~ FSRDC_use | author_id + year^field, data=data_reg_02[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)
table_03_column5to6 <- feols(asinh(n_policy_citations) ~ FSRDC_use | sw(author_id + year^field, author_id + year^field + journal) , data=data_reg_02[n_papers_per_author>1], fixef.rm = 'both', cluster=~author_id)

mean_lhs <- c(round(mean(data_01[n_papers_per_author>1][table_03_column1$obs_selection[[1]], as.numeric(n_policy_citations>0)]), 3), 
              rep(paste(round(mean(data_01[n_papers_per_author>1][table_03_column2to3$`fixef: author_id + year^field + journal`$obs_selection[[1]], n_policy_citations]),0), '\ncitations', sep=""), 2),
              round(mean(data_reg_02[n_papers_per_author>1][table_03_column4$obs_selection[[1]], as.numeric(n_policy_citations>0)]), 3), 
              rep(paste(round(mean(data_reg_02[n_papers_per_author>1][table_03_column5to6$`fixef: author_id + year^field + journal`$obs_selection[[1]], n_policy_citations]),0), '\ncitations', sep=""), 2))

notes_reg_policy_01_b = "Results of estimating by OLS: $y_{iajft} = FSRDC_{i} + FE_{...} + \\epsilon_{iajft}$ for paper $i$, author $a$, journal $j$, field $f$ and year of publication $t$. In columns (1) and (4) the outcome variable $y_{iajft}$ is whether the paper received at least one citation from a policy document. In columns (2) to (3) and (5) to (6) the outcome variable $y_{iajft}$ is the inverse hyperbolic sine of the amount of policy citations that the paper $i$ has received. Each paper is classified into only one of 16 Economics fields. In order to identify the author fixed effect it is required that the author publishes more than one paper. We keep the sample of papers constant across regressions at the paper-author level by dropping single-paper authors. The amount of observations reported is the effective sample size."

etable(table_03_column1, table_03_column2to3$`fixef: author_id + year^field`, table_03_column2to3$`fixef: author_id + year^field + journal`,
table_03_column4, table_03_column5to6$`fixef: author_id + year^field`, table_03_column5to6$`fixef: author_id + year^field + journal`,
       extralines=list('_^Observation level'=c('paper-author', 'paper-author', 'paper-author', 'paper-author', 'paper-author', 'paper-author'), "__Mean LHS"=mean_lhs), 
       tex=T, digits=3, meta=~time+call, se.below = T, digits.stats ='r2', fitstat=c('n'), se.row = T,  dict=c(year='Year', field="Field", FSRDC_use='Uses restricted-access data (0/1)', "asinh(n_citations)"='asinh(citations received)', "asinh(n_policy_citations)"='asinh(policy citations received)', `as.numeric(n_policy_citations>0)`='Policy citation received 0/1', top5='Top 5 publication', author_id="Author", journal='Journal'), notes=paste("In columns (1) to (3) the sample includes all papers. In columns (4) to (6) the sample includes only papers that used FSRDC, NLS or PSID data. ", notes_reg_policy_01_b, sep=""), tpt=T, file='output/tables/table_03.tex', replace=T)


# Table A1. Articles using restricted-access microdata are more likely to include a mix of senior and junior authors
data_temp_01 <- data_01[, .(age_diff=max(author_firstyear)-min(author_firstyear), n_authors=uniqueN(author_id)),.(article_id, data_use, year, field)]
mod_age_11_b <- feols(age_diff ~ i(data_use, ref='PSID') | sw(year, year + field, year^field), data=data_temp_01[n_authors>1])
etable(mod_age_11_b, tex=T, digits=3, notes="The dependent variable is the maximum age difference between coauthors within a paper. The independent variable is a dummy for the type of data that the paper uses: PSID, NLS, FSRDC or unknown. PSID is the reference category. An observation is a paper. Only papers with more than one coauthor are included in the sample.", meta=~time+call, tabular='X', fitstat=c('n','my'), file='output/tables/table_A1.tex', replace=T)

# Table A2. Articles using restricted-access microdata are more likely to include authors from prestigious institutions
data_temp_01 <- data_01[rank_institution!=9999,.(min_rank_univ=min(rank_institution), n_authors=uniqueN(author_id)),.(article_id, FSRDC_use, year, field)]
mod_team_quality_univ_01 <- feols(FSRDC_use ~ log(min_rank_univ) | sw(year^field, year^field + n_authors), data=data_temp_01) #
etable(mod_team_quality_univ_01, dict=c(year='Year', field="Field", FSRDC_use='Uses restricted-access data (0/1)', "asinh(n_citations)"='asinh(citations received)', "asinh(n_policy_citations)"='asinh(policy citations received)', `as.numeric(n_policy_citations>0)`='Policy citation received 0/1', top5='Top 5 publication', author_id="Author", journal='Journal', n_authors='Number of authors', `log(min_rank_univ)`="log(min university rank)"), tex=T, digits=3, meta=~time+call, tabular='X', fitstat=c('n','my'), file='output/tables/table_A2.tex', replace=T)

# Table A3. Articles using restricted-access microdata are more likely to include authors with previous Top 5 publications
data_temp_01 <- data_01[, .(first_author_year_top5=min(author_firstyear_top5), n_authors=uniqueN(author_id), age_diff=max(author_firstyear)-min(author_firstyear), avg_coauthor_age = mean(coauthor_age), max_coauthor_age=max(coauthor_age)),.(article_id, FSRDC_use, year, field)]
data_temp_01[,const:=1]
data_temp_01[,author_previous_top5:=first_author_year_top5<year]

mod_team_quality_01 <- feols(FSRDC_use ~ author_previous_top5 | sw(year^field, year^field + n_authors, year^field + n_authors + const[[avg_coauthor_age]], year^field + n_authors + const[[avg_coauthor_age]] + const[[age_diff]], year^field + n_authors + const[[avg_coauthor_age]] + const[[age_diff]] + const[[max_coauthor_age]]), data=data_temp_01)
etable(mod_team_quality_01, tex=T, digits=3, notes="The dependent variable is a dummy for whether the  paper uses FSRDC data. The independent variable is whether the paper's coauthorship team had at least one author with a paper published in a top 5 journal previous to the publication of the FSRDC paper in question. Controls (added linearly, as slopes) for average age of coauthors in team and maximum age difference within team are added. An observation is a paper. All papers are included in the sample.", meta=~time+call, replace=T, tabular='X', fitstat=c('n','my'), dict=c(year='Year', field="Field", FSRDC_use='Uses restricted-access data (0/1)', "asinh(n_citations)"='asinh(citations received)', "asinh(n_policy_citations)"='asinh(policy citations received)', `as.numeric(n_policy_citations>0)`='Policy citation received 0/1', top5='Top 5 publication', author_id="Author", journal='Journal', n_authors='Number of authors', `log(min_rank_univ)`="log(min university rank)", `author_previous_top5TRUE`="Coauthor with top 5 publication", `avg_coauthor_age (const)`='Average coauthor age', `age_diff (const)`="Max age difference", `max_coauthor_age (const)`="Max age coauthor"), file='output/tables/table_A3.tex')



### REPLICATION OF FIGURES
# Figure 1.a. Articles using restricted-access data
temp_01_noFSRDC <- data_01[FSRDC_use==0,.(n_papers_y_field=uniqueN(article_id), n_authors_y_field=uniqueN(author_id), n_authors_papers_y_field=.N),.(year, field)][,.(field, n_authors_y_field, n_authors_y=sum(n_authors_y_field), n_papers_y_field, n_papers_y=sum(n_papers_y_field), n_authors_papers_y=sum(n_authors_papers_y_field), n_authors_papers_y_field), .(year)]
temp_01_noFSRDC[,`:=`(n_authors_per_paper_y_field=n_authors_y_field/n_papers_y_field, 
              n_papers_per_author_y_field=n_authors_papers_y_field/n_authors_y_field)]#,
temp_01_noFSRDC[, FSRDC:=0]

temp_01_yesFSRDC <- data_01[FSRDC_use==1,.(n_papers_y_field=uniqueN(article_id), n_authors_y_field=uniqueN(author_id), n_authors_papers_y_field=.N),.(year, field)][,.(field, n_authors_y_field, n_authors_y=sum(n_authors_y_field), n_papers_y_field, n_papers_y=sum(n_papers_y_field), n_authors_papers_y=sum(n_authors_papers_y_field), n_authors_papers_y_field), .(year)]
temp_01_yesFSRDC[,`:=`(n_authors_per_paper_y_field=n_authors_y_field/n_papers_y_field, 
                      n_papers_per_author_y_field=n_authors_papers_y_field/n_authors_y_field)]#,
temp_01_yesFSRDC[, FSRDC:=1]

temp_01_FSRDC <- rbind(temp_01_yesFSRDC, temp_01_noFSRDC)
temp_01_FSRDC[, FSRDC:=factor(FSRDC, levels=c(0, 1),labels = c("Does not use FSRDC data", "Uses FSRDC data"))]
temp_01_FSRDC <- merge(temp_01_FSRDC, temp_01_FSRDC[!duplicated(temp_01_FSRDC, by=c('year', "FSRDC")),.(n_papers_y_total=sum(n_papers_y)), .(year)], by='year', how='left')

temp_01_FSRDC[,share_papers_field_y:=n_papers_y_field/n_papers_y]

plot_temp <- ggplot(temp_01_FSRDC[!duplicated(temp_01_FSRDC, by=c('year', "FSRDC"))][FSRDC=="Uses FSRDC data"], aes(x=year, y=n_papers_y)) +
  geom_point(size=3, color=c25[1]) +
  geom_line(linewidth=1.5, color=c25[1]) +
  plot_theme +
  xlab("Year") +
  ylab("Number of articles") +
  scale_x_continuous(breaks = c(1991, seq(1995,2015,5),2019)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text=element_text(size=18),
        axis.title.x=element_text(size=20),
        axis.title.y=element_text(size=20))
ggsave(plot= plot_temp, filename="output/plots/figure_01a.png", width = 8, height = 6)

# 1.b. Share of articles using restricted-access data
plot_temp <- ggplot(temp_01_FSRDC[!duplicated(temp_01_FSRDC, by=c('year', "FSRDC")),][FSRDC=="Uses FSRDC data", .(share_papers_y=n_papers_y/n_papers_y_total),.(year)], aes(x=year, y=share_papers_y*100)) +
    geom_point(size=3, color=c25[1]) +
    geom_line(linewidth=1.5, color=c25[1]) +
    plot_theme +
    xlab("Year") +
    ylab("Share of articles (%)") +
    scale_x_continuous(breaks = c(1991, seq(1995,2015,5),2019)) +
    theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text=element_text(size=18),
        axis.title.x=element_text(size=20),
        axis.title.y=element_text(size=20))
ggsave(plot= plot_temp, filename="output/plots/figure_01b.png", width = 8, height = 6)


# Figure 2. Share of articles using restricted-access microdata by research field
temp_01_noFSRDC_02 <- data_01[FSRDC_use==0,.(n_papers_y_field=uniqueN(article_id), n_authors_y_field=uniqueN(author_id), n_authors_papers_y_field=.N),.(year, field)][,.(field, n_authors_y_field, n_authors_y=sum(n_authors_y_field), n_papers_y_field, n_papers_y=sum(n_papers_y_field), n_authors_papers_y=sum(n_authors_papers_y_field), n_authors_papers_y_field), .(year)]
temp_01_noFSRDC_02[,`:=`(n_authors_per_paper_y_field=n_authors_y_field/n_papers_y_field, 
                        n_papers_per_author_y_field=n_authors_papers_y_field/n_authors_y_field)]#,
temp_01_noFSRDC_02[, FSRDC:=0]

temp_01_yesFSRDC_02 <- data_01[FSRDC_use==1,.(n_papers_y_field=uniqueN(article_id), n_authors_y_field=uniqueN(author_id), n_authors_papers_y_field=.N),.(year, field)][,.(field, n_authors_y_field, n_authors_y=sum(n_authors_y_field), n_papers_y_field, n_papers_y=sum(n_papers_y_field), n_authors_papers_y=sum(n_authors_papers_y_field), n_authors_papers_y_field), .(year)]
temp_01_yesFSRDC_02[,`:=`(n_authors_per_paper_y_field=n_authors_y_field/n_papers_y_field, 
                        n_papers_per_author_y_field=n_authors_papers_y_field/n_authors_y_field)]#,
temp_01_yesFSRDC_02[, FSRDC:=1]

temp_01_FSRDC_02 <- rbind(temp_01_yesFSRDC_02, temp_01_noFSRDC_02)
temp_01_FSRDC_02[, FSRDC:=factor(FSRDC, levels=c(0, 1),labels = c("Does not use FSRDC data", "Uses FSRDC data"))]
temp_01_FSRDC_02 <- merge(temp_01_FSRDC_02, temp_01_FSRDC_02[!duplicated(temp_01_FSRDC_02, by=c('year', "FSRDC")),.(n_papers_y_total=sum(n_papers_y)), .(year)], by='year', how='left')
temp_01_FSRDC_02[,share_papers_field_y:=n_papers_y_field/n_papers_y]


data_plot_03 <- temp_01_FSRDC_02[,.(n_papers_y_field=sum(n_papers_y_field)),.(field, FSRDC)][,.(field, n_papers_y_field, share_papers_field_y=n_papers_y_field/sum(n_papers_y_field)), .(FSRDC)]
fields_to_aggregate_to_other <- as.character(unique(data_plot_03[share_papers_field_y>0.05, field])) # necessary for appendix figures C3 and C4
data_plot_03[,field_03:=field]
data_plot_03[!(field %in% as.character(unique(data_plot_03[share_papers_field_y>0.05, field]))), field_03:='Other'] # aggregate fields with low shares to "Other"
data_plot_03[field_03=="Fin", field_03:="Finance"]
data_plot_03[field_03=="Macro", field_03:="Macroeconomics"]
data_plot_03[field_03=="Metrics", field_03:="Econometrics"]
data_plot_03[field_03=="IO", field_03:="Industrial\nOrganization"]
data_plot_03[field_03=="Intl", field_03:="International\nEconomics"]
data_plot_03[field_03=="Micro theory", field_03:="Microeconomics\nTheory"]
data_plot_03[field_03=="Micro empirical", field_03:="Microeconomics\nEmpirical"]
data_plot_03[field_03=="Labor", field_03:="Labor\nEconomics"]
data_plot_03 <- data_plot_03[,.(n_papers_y_field = sum(n_papers_y_field)),. (field_03, FSRDC)]
data_plot_03[,n_papers_y_total:=sum(n_papers_y_field),.(FSRDC)]
data_plot_03[, share_papers_field_y:=n_papers_y_field/n_papers_y_total]
data_plot_03[, field3:=factor(field_03, levels=c(data_plot_03[field_03!='Other'&FSRDC=='Uses FSRDC data'][order(-share_papers_field_y), field_03], "Other"))] # reordering fields by their share. necessary to display them in plot according to highest to smallest share
field3_ordering <- c(data_plot_03[field_03!='Other'&FSRDC=='Uses FSRDC data'][order(-share_papers_field_y), field_03], "Other")
data_plot_03[, FSRDC3:='Uses restricted-access data']
data_plot_03[FSRDC=='Does not use FSRDC data', FSRDC3:='Does not use restricted-access data']
data_plot_03[, FSRDC4:=factor(FSRDC3, levels=c("Uses restricted-access data", "Does not use restricted-access data"))]

colourCount_03 <- length(unique(data_plot_03[, field_03]))
color_fields_03 <- transpose(data.table(getPalette(colourCount_03)))
names(color_fields_03) <- sort(unique(data_plot_03[, field_03]))

plot_temp <- ggplot(data_plot_03, aes(x=field3, y=share_papers_field_y*100, fill=FSRDC4)) +
    geom_bar(stat="identity", color="black", position=position_dodge(), width=0.8)+
    scale_fill_brewer(palette="Blues", direction=-1)+
    plot_theme +
    ylab("Share of articles in field (%)") +
    theme(axis.title.x =element_blank(), line = element_blank(), axis.ticks.y=element_line(), legend.position="top", legend.text=element_text(size=20), axis.text.x = element_text(size=16), axis.text.y = element_text(size=18), axis.title.y = element_text(size=20))
ggsave(plot= plot_temp, filename="output/plots/figure_02.png", width = 16, height = 6)

# Extract the Y axis limits and breaks from plot_temp. to be used in plots by decade
plot_build <- ggplot_build(plot_temp)
y_limits <- plot_build$layout$panel_scales_y[[1]]$range$range
y_ticks <- plot_build$layout$panel_scales_y[[1]]$breaks


# Figure 3. Maximum age difference between co-authors of articles using restricted-access microdata
data_temp_01 <- data_01[, .(age_diff=max(author_firstyear)-min(author_firstyear), n_authors=uniqueN(author_id)),.(article_id, FSRDC_use)]
data_temp_01[, FSRDC:=factor(FSRDC_use, levels=c(1,0),labels = c("Uses restricted-access data", "Does not use restricted-access data"))]

plot_temp <- ggplot(data_temp_01[n_authors>1], aes(x=age_diff, fill=FSRDC)) + 
  geom_density(alpha=.3, bw=3) +
  scale_fill_manual(values = c(c25[2],c25[1])) +
  plot_theme +
  theme(legend.position = c(0.7,0.8)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text=element_text(size=18),
        # axis.title=element_text(size=20),
        axis.title.x=element_text(size=20),
        axis.title.y=element_text(size=20),
        legend.text=element_text(size=16)) +
  xlab("Maximum age difference between co-authors (years)") +
  ylab("Density")
ggsave(plot= plot_temp, filename="output/plots/figure_03.png", width = 8, height = 6)

# Appendix figures
# Figure C1. Yearly publications employing restricted-access microdata.
data_temp_01 <- data_01[FSRDC_use==1,.(n_fsrdc_papers=uniqueN(article_id)),.(year)]

reg_01 <- feols(n_fsrdc_papers ~ year, data = data_temp_01)
reg_coef <- coef(reg_01)[2]
reg_se <- sqrt(vcov(reg_01)[2, 2])
reg_pval <- summary(reg_01)$coeftable[2, 4]
sig_level <- ifelse(reg_pval < 0.00001, "< 0.00001", "")

plot_temp <- ggplot(data_temp_01, aes(x = year, y = n_fsrdc_papers)) +
    geom_point(size=4, shape=19, color=c25[1]) +
    geom_smooth(method = "lm", se = FALSE, color = c25[2]) +
    labs(x = "Year of Publication",
    y = "Number of FSRDC Papers"
    ) +
    plot_theme +
    scale_x_continuous(breaks = c(1991, seq(1995,2015,5),2019)) +
    annotate("text", x = 1995, y = 35, 
            label = paste0("Coef: ", round(reg_coef, 3), "\nSE: ", round(reg_se, 3), 
                            "\np-val: ", sig_level), 
            hjust = 0, size = 5) +
    theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text=element_text(size=18),
    axis.title.x=element_text(size=20),
    axis.title.y=element_text(size=20))
ggsave(plot= plot_temp, filename="output/plots/figure_C1.png", width = 8, height = 6)


# Figure C2. Likelihood that paper employing restricted-access microdata appear in a top five journal
data_temp_01 <- data_01[FSRDC_use==1,.(n_fsrdc_papers=uniqueN(article_id)),.(year)]
data_temp_01 <- merge(data_temp_01, data_01[FSRDC_use==1&top5==1,.(n_fsrdc_papers_top5=uniqueN(article_id)),.(year)], by='year', all=T)
data_temp_01[is.na(n_fsrdc_papers_top5),n_fsrdc_papers_top5:=0]
data_temp_01[, share_top5:=n_fsrdc_papers_top5/n_fsrdc_papers]

reg_02 <- feols(share_top5 ~ year, data = data_temp_01)
reg_coef <- coef(reg_02)[2]
reg_se <- sqrt(vcov(reg_02)[2, 2])
reg_pval <- summary(reg_02)$coeftable[2, 4]
sig_level <- ifelse(reg_pval < 0.00001, "< 0.00001", round(reg_pval,2))

plot_temp <- ggplot(data_temp_01, aes(x = year, y = share_top5)) +
    geom_point(size=4, shape=19, color=c25[1]) +
    geom_smooth(method = "lm", se = FALSE, color = c25[2]) +
    labs(x = "Year of Publication",
    y = "Number of FSRDC Papers"
    ) +
    plot_theme +
    scale_x_continuous(breaks = c(1991, seq(1995,2015,5),2019)) +
    annotate("text", x = 2005, y = 0.45, 
            label = paste0("Coef: ", format(round(reg_coef, 4), scientific = FALSE), "\nSE: ", round(reg_se, 3), 
                            "\np-val: ", sig_level), 
            hjust = 0, size = 5) +
    theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text=element_text(size=18),
    axis.title.x=element_text(size=20),
    axis.title.y=element_text(size=20))
ggsave(plot= plot_temp, filename="output/plots/figure_C2.png", width = 8, height = 6)


# Figure C3. Share of articles using restricted-access microdata by research field 1991-2000. 
data_plot_03 <- temp_01_FSRDC_02[,.(n_papers_y_field=sum(n_papers_y_field)),.(field, FSRDC, year_bin=ifelse(year>=1991&year<2001, "1991-2000", ifelse(year>=2001&year<2010, "2001-2009", ifelse(year>=2010&year<2020, "2010-2019", NA))))][,.(field, n_papers_y_field, share_papers_field_y=n_papers_y_field/sum(n_papers_y_field)), .(FSRDC, year_bin)]
data_plot_03[,field_03:=field]
data_plot_03[!(field %in% fields_to_aggregate_to_other), field_03:='Other']
data_plot_03[field_03=="Fin", field_03:="Finance"]
data_plot_03[field_03=="Macro", field_03:="Macroeconomics"]
data_plot_03[field_03=="Metrics", field_03:="Econometrics"]
data_plot_03[field_03=="IO", field_03:="Industrial\nOrganization"]
data_plot_03[field_03=="Intl", field_03:="International\nEconomics"]
data_plot_03[field_03=="Micro theory", field_03:="Microeconomics\nTheory"]
data_plot_03[field_03=="Micro empirical", field_03:="Microeconomics\nEmpirical"]
data_plot_03[field_03=="Labor", field_03:="Labor\nEconomics"]
data_plot_03 <- data_plot_03[,.(n_papers_y_field = sum(n_papers_y_field)),. (field_03, FSRDC, year_bin)]
data_plot_03[,n_papers_y_total:=sum(n_papers_y_field),.(FSRDC, year_bin)]
data_plot_03[, share_papers_field_y:=n_papers_y_field/n_papers_y_total]
data_plot_03[, field3:=factor(field_03, levels=field3_ordering)] # reordering fields by their share in Figure 2
data_plot_03[, FSRDC3:='Uses Census data']
data_plot_03[FSRDC=='Does not use FSRDC data', FSRDC3:='Does not use Census data']
data_plot_03[, FSRDC4:=factor(FSRDC3, levels=c("Uses Census data", "Does not use Census data"))]

y_limits2 <- c(min(y_limits), max(max(y_limits),data_plot_03[,max(share_papers_field_y*100*1.1)]))

plot_temp <- ggplot(data_plot_03[year_bin=="1991-2000"], aes(x=field3, y=share_papers_field_y*100, fill=FSRDC4)) +
    geom_bar(stat="identity", color="black", position=position_dodge(), width=0.8)+
    scale_fill_brewer(palette="Blues", direction=-1)+
    plot_theme +
    ylab("Share of articles in field (%)") +
    scale_y_continuous(limits = y_limits2, breaks = y_ticks) + 
    theme(axis.title.x =element_blank(), line = element_blank(), axis.ticks.y=element_line(), legend.position="top", legend.text=element_text(size=20), axis.text.x = element_text(size=16), axis.text.y = element_text(size=18), axis.title.y = element_text(size=20))
ggsave(plot= plot_temp, filename=paste("output/plots/figure_C3.png", sep=""), width = 16, height = 6)

# Figure C4. Share of articles using restricted-access microdata by research field 2010-2019
plot_temp <- ggplot(data_plot_03[year_bin=="2010-2019"], aes(x=field3, y=share_papers_field_y*100, fill=FSRDC4)) +
    geom_bar(stat="identity", color="black", position=position_dodge(), width=0.8)+
    scale_fill_brewer(palette="Blues", direction=-1)+
    plot_theme +
    ylab("Share of articles in field (%)") +
    scale_y_continuous(limits = y_limits2, breaks = y_ticks) + 
    theme(axis.title.x =element_blank(), line = element_blank(), axis.ticks.y=element_line(), legend.position="top", legend.text=element_text(size=20), axis.text.x = element_text(size=16), axis.text.y = element_text(size=18), axis.title.y = element_text(size=20))
ggsave(plot= plot_temp, filename=paste("output/plots/figure_C4.png", sep=""), width = 16, height = 6)

